TL;DR

Given the results of the analysis of the Fluidigm data, here I explore the role of the V intercept in a more complex data (Allen).

We want to check if the intercept acts as a size factor here too and, if true, if W capture some interesting biology.

Interestingly, in this more complex dataset, both intercepts are needed to get a clear picture of the data in two dimensions.

Data

As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. I will color-code the cells by either known cell type or by inferred cluster (inferred in the original study).

Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("allen")
allen_core <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]

filter <- apply(assay(allen_core)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 11545

To speed up the computations, I will focus on the top 1,000 most variable genes.

allen_core <- allen_core[filter,]
core <- assay(allen_core)

vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(1, 2))
bio <- as.factor(colData(allen_core)$driver_1_s)
cl <- as.factor(colData(allen_core)$Primary.Type)

detection_rate <- colSums(core>0)
coverage <- colSums(core)

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, centered not scaled")
legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, centered not scaled")

df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                        PC1         PC2 detection_rate   coverage
## PC1             1.00000000 -0.01052424      0.8066812  0.4453172
## PC2            -0.01052424  1.00000000     -0.3601716 -0.3286485
## detection_rate  0.80668124 -0.36017158      1.0000000  0.5275845
## coverage        0.44531717 -0.32864852      0.5275845  1.0000000

V intercept in both Mu and Pi

print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
##    user  system elapsed 
## 106.854  30.021  64.516

Plot the results with cells colored according to their biological condition.

par(mfrow=c(1, 2))
plot(zinb_Vall@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_Vall@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("topright", levels(cl), fill=cols2)

Explore Gamma estimates

One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2   gamma_mu   gamma_pi
## W1              1.00000000  0.03514491 -0.0843733  0.3598978
## W2              0.03514491  1.00000000 -0.3676352  0.3710653
## gamma_mu       -0.08437330 -0.36763518  1.0000000 -0.3348686
## gamma_pi        0.35989777  0.37106533 -0.3348686  1.0000000
## detection_rate -0.35188976 -0.34026613  0.3712522 -0.9939701
## coverage       -0.05250198 -0.05649406  0.8504471 -0.4846195
##                detection_rate    coverage
## W1                 -0.3518898 -0.05250198
## W2                 -0.3402661 -0.05649406
## gamma_mu            0.3712522  0.85044711
## gamma_pi           -0.9939701 -0.48461953
## detection_rate      1.0000000  0.52758451
## coverage            0.5275845  1.00000000

\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.

par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vall)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vall))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vall@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vall@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vall@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vall@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

No V intercept

print(system.time(zinb_Vnone <- zinbFit(core, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(core)))))
##    user  system elapsed 
## 120.871  30.694  59.448
par(mfrow=c(1, 2))
plot(zinb_Vnone@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_Vnone@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vnone@W[,1], W2 = zinb_Vnone@W[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                          W1          W2 detection_rate   coverage
## W1              1.000000000 0.005426967     -0.4105128 -0.1328474
## W2              0.005426967 1.000000000      0.8736250  0.5865832
## detection_rate -0.410512821 0.873624999      1.0000000  0.5275845
## coverage       -0.132847434 0.586583172      0.5275845  1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vnone)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vnone))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vnone@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vnone@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vnone@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vnone@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

V intercept only in Mu

V <- cbind(rep(0, NROW(core)), rep(1, NROW(core)))

print(system.time(zinb_Vmu <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=2L, which_V_pi=1L)))
##    user  system elapsed 
## 117.983  30.208  62.073
par(mfrow=c(1, 2))
plot(zinb_Vmu@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vmu@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vmu@W[,1], W2 = zinb_Vmu@W[,2], gamma_mu = zinb_Vmu@gamma_mu[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2  gamma_mu detection_rate   coverage
## W1              1.00000000 -0.06020466 0.0485493     -0.2663277 -0.1083201
## W2             -0.06020466  1.00000000 0.5593379      0.8898885  0.3772103
## gamma_mu        0.04854930  0.55933792 1.0000000      0.5880261  0.9235989
## detection_rate -0.26632771  0.88988849 0.5880261      1.0000000  0.5275845
## coverage       -0.10832007  0.37721026 0.9235989      0.5275845  1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vmu)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vmu))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vmu@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vmu@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vmu@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vmu@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

V intercept only in Pi

print(system.time(zinb_Vpi <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=1L, which_V_pi=2L)))
##    user  system elapsed 
## 152.040  40.292  80.801
par(mfrow=c(1, 2))
plot(zinb_Vpi@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vpi@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vpi@W[,1], W2 = zinb_Vpi@W[,2], gamma_pi = zinb_Vpi@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1         W2    gamma_pi detection_rate
## W1              1.00000000  0.1305085  0.09593638    -0.09884142
## W2              0.13050848  1.0000000 -0.37656124     0.27518468
## gamma_pi        0.09593638 -0.3765612  1.00000000    -0.98667190
## detection_rate -0.09884142  0.2751847 -0.98667190     1.00000000
## coverage        0.15676103 -0.2568486 -0.47896136     0.52758451
##                  coverage
## W1              0.1567610
## W2             -0.2568486
## gamma_pi       -0.4789614
## detection_rate  0.5275845
## coverage        1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vpi)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vpi))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vpi@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vpi@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vpi@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vpi@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

Do we need more than two dimensions?

print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3)))
##    user  system elapsed 
## 121.030  32.532  61.484
pairs(zinb_3@W, col=cols[bio], pch=19)

pairs(zinb_3@W, col=cols2[cl], pch=19)

## write matrices to file to feed to ZIFA in python
write.csv(log1p(core), file="allen.csv")

Using all genes

core <- assay(allen_core)
dim(core)
## [1] 11545   285
print(system.time(zinb_all <- zinbFit(core, ncores = 3, K = 2)))
##     user   system  elapsed 
## 1162.914  161.906  572.368
par(mfrow=c(1, 2))
plot(zinb_all@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_all@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

Interestingly, when using all the genes, things don’t work nicely anymore. I wonder if the difference is the selection of the most variable genes, rather than the complexity of the data.

ZIFA (top 1000 most variable genes)

Full algorithm

import sys
sys.path.insert(1, '/usr/local/lib/python2.7/site-packages/')

from pandas import read_csv
from ZIFA import ZIFA
import numpy as np

df = read_csv('allen.csv')
del df['Unnamed: 0']
lc = np.array(df)
Y = np.transpose(lc)
Z, model_params  = ZIFA.fitModel(Y, 2)

np.savetxt('allen_zifa.csv', Z, delimiter=',')
## Running zero-inflated factor analysis with N = 285, D = 1000, K = 2
## Param change below threshold 1.000e-02 after 6 iterations
par(mfrow=c(1, 2))
zifa_res <- read.csv("allen_zifa.csv", header=FALSE)

plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")

wrapRzifa <- function(Y, block = F){
  d = digest(Y, "md5")
  tmp = paste0(tempdir(), '/', d)
  write.csv(Y, paste0(tmp, '.csv'))
  
  if (!file.exists(paste0(tmp, '_zifa.csv'))){
    print('run ZIFA')
    bb = ifelse(block, '-b ', '')
    cmd = sprintf('python run_zifa.py %s%s.csv %s_zifa.csv', bb, tmp, tmp)
    system(cmd)
  }
  read.csv(sprintf("%s_zifa.csv", tmp), header=FALSE)
}

Y = log2(core + 1)
zifa_res = wrapRzifa(Y)
plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")
df <- data.frame(Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         Z1         Z2 detection_rate   coverage
## Z1              1.00000000 0.07493093     -0.3890703 -0.2343373
## Z2              0.07493093 1.00000000      0.6859741  0.4423235
## detection_rate -0.38907034 0.68597407      1.0000000  0.5275845
## coverage       -0.23433727 0.44232350      0.5275845  1.0000000
df <- data.frame(W1 = zinb_Vall@W[,1],
                 W2 = zinb_Vall@W[,2],
                 Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in both Mu and Pi')

print(cor(df, method="spearman"))
##             W1          W2          Z1          Z2
## W1  1.00000000  0.03514491  0.87737303 -0.06011083
## W2  0.03514491  1.00000000 -0.27471165 -0.77121359
## Z1  0.87737303 -0.27471165  1.00000000  0.07493093
## Z2 -0.06011083 -0.77121359  0.07493093  1.00000000
df <- data.frame(W1 = zinb_Vnone@W[,1],
                 W2 = zinb_Vnone@W[,2],
                 Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'No intercept')

print(cor(df, method="spearman"))
##             W1           W2           Z1         Z2
## W1 1.000000000  0.005426967  0.940024468 0.11583976
## W2 0.005426967  1.000000000 -0.008162482 0.88366985
## Z1 0.940024468 -0.008162482  1.000000000 0.07493093
## Z2 0.115839757  0.883669851  0.074930925 1.00000000
df <- data.frame(W1 = zinb_Vmu@W[,1],
                 W2 = zinb_Vmu@W[,2],
                 Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in Mu')

print(cor(df, method="spearman"))
##             W1          W2          Z1         Z2
## W1  1.00000000 -0.06020466  0.93880730 0.27101971
## W2 -0.06020466  1.00000000 -0.21813102 0.85510842
## Z1  0.93880730 -0.21813102  1.00000000 0.07493093
## Z2  0.27101971  0.85510842  0.07493093 1.00000000
df <- data.frame(W1 = zinb_Vpi@W[,1],
                 W2 = zinb_Vpi@W[,2],
                 Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in Pi')

print(cor(df, method="spearman"))
##           W1        W2         Z1         Z2
## W1 1.0000000 0.1305085 0.88529862 0.37835330
## W2 0.1305085 1.0000000 0.13259032 0.63760776
## Z1 0.8852986 0.1325903 1.00000000 0.07493093
## Z2 0.3783533 0.6376078 0.07493093 1.00000000